library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer) ; library(ggpubr)
library(biomaRt)
library(caret) ; library(ROCR) ; library(car) ; library(MLmetrics)
library(corrplot)
library(expss) ; library(knitr)

# Map SFARI gene-score levels to their standard colours.
#
# r: integer vector of palette positions (1-10); the first positions are the
#    SFARI score colours, the last four are greys used for Neuronal/Others/NA.
# Returns the corresponding colour(s) as a character vector.
# (The original assigned the result to a local variable and relied on the
# invisible return value of the assignment; returning the subset directly is
# equivalent for all callers and makes the value visible when called directly.)
SFARI_colour_hue = function(r) {
  c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9',
    '#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load and prepare data


Load dataset (preprocessing code in 20_04_07_create_dataset.html)

# Clusterings created by WGCNA
clustering_selected = 'DynamicHybrid'
clusterings = read_csv('./../Data/clusters.csv')
# Extract the selected clustering as a plain vector; [[ ]] replaces the original
# data.frame/unlist/unname round-trip and works for tibbles and data.frames alike
clusterings$Module = clusterings[[clustering_selected]]
assigned_module = clusterings %>% dplyr::select(ID, Module)

# Dataset created with 20_04_07_create_dataset.html
dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names=1)
# Assumes clusterings rows are in the same order as dataset rows — TODO confirm
dataset$Module = clusterings$Module

# Update gene scores to new SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')

# Keep one row per Ensembl ID and drop entries without an ID
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]

# Look up the SFARI score of each gene in the dataset (NA = not a SFARI gene).
# The join key is now explicit: ID is the only shared column, so behaviour is
# unchanged, but this silences the join message and guards against future
# accidental extra key columns
gene_scores = dataset %>% mutate(ID = rownames(.)) %>% 
              left_join(SFARI_genes %>% dplyr::select(ID, `gene-score`), by = 'ID') %>%
              pull(`gene-score`)

# mutate() drops rownames, so save and restore them around the update
rownames_dataset = rownames(dataset)
dataset = dataset %>% mutate(gene.score = gene_scores)
rownames(dataset) = rownames_dataset

# GO Neuronal annotations: genes whose GO functional annotations match the
# regex 'neuron' are labelled as Neuronal (one row per gene, Neuronal = 1)
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>%
              filter(grepl('neuron', go_term)) %>%
              mutate(ID = as.character(ensembl_gene_id)) %>%
              dplyr::select(-ensembl_gene_id) %>%
              distinct(ID) %>%
              mutate(Neuronal = 1)


# Add gene symbol
# Query the archived Feb-2014 Ensembl release (matching the Gencode v19
# annotation used by the expression data) for each gene's external symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(dataset), mart=mart)

# NOTE(review): getinfo and mart are removed here, but a later rm() (after the
# correlation plot) tries to remove them again, which raises a warning
rm(getinfo, mart, rownames_dataset, GO_annotations)


Gene filtering:


  • Remove genes without cluster (Module=gray)
# Modules of the genes that have no MTcor (the genes left without a cluster)
rm_cluster = dataset %>% filter(is.na(MTcor)) %>% pull(Module) %>% unique %>% as.character

print(paste0('Removing ', sum(dataset$Module=='gray'), ' genes without cluster'))
## [1] "Removing 113 genes without cluster"
# Keep only genes that were assigned to a (non-gray) module and have an MTcor
new_dataset = dataset %>% filter(Module != 'gray' & !is.na(MTcor))


Variable changes:


  • Using Module Membership variables instead of binary module membership

  • Not including p-value variables

  • Including a new variable with the absolute value of GS

  • Removing information from gray module (unclassified genes)

  • Objective variable: Binary label indicating if it’s in the SFARI dataset or not

# Drop p-value columns, the Module column and the gray-module membership,
# add |GS| and the binary SFARI label, and drop the raw score
new_dataset = new_dataset %>% dplyr::select(-c(matches('pval|Module'), MMgray)) %>%
              mutate('absGS'=abs(GS), 'SFARI'=!is.na(gene.score)) %>%
              dplyr::select(-gene.score)

# Restore rownames using the SAME condition that built new_dataset above
# (Module != 'gray' AND !is.na(MTcor)); the original only checked for the gray
# module, which silently misaligns if any non-gray gene has a missing MTcor
rownames(new_dataset) = rownames(dataset)[dataset$Module != 'gray' & !is.na(dataset$MTcor)]

rm(rm_cluster)
original_dataset = dataset
dataset = new_dataset
print(paste0('The final dataset contains ', nrow(dataset), ' observations and ', ncol(dataset), ' variables'))
## [1] "The final dataset contains 16034 observations and 58 variables"
rm(new_dataset)



Exploratory Analysis


PCA of Variables

The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and both SFARI and absGS in the middle of both groups

# Module-Trait correlation of each module (one row per module)
mtcor_by_module = original_dataset %>% dplyr::select(Module, MTcor) %>% unique
colnames(mtcor_by_module) = c('ID','MTcor')

# PCA on the transposed dataset: the observations are the dataset's columns
pca = dataset %>% t %>% prcomp

plot_data = data.frame('ID'=colnames(dataset), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2],
                       type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
                            ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
                            ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))

# Module colours are stored as '#xxxxxx' but the membership columns are named
# 'MM.xxxxxx', so translate before joining
plot_data = mtcor_by_module %>% mutate(ID = gsub('#','MM.',ID)) %>% right_join(plot_data, by='ID')

ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(aes(id=ID)) +
         xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
         ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
         scale_colour_distiller(palette = 'RdBu', na.value = 'darkgrey') + theme_minimal() +
         ggtitle('PCA of variables coloured by Module-Diagnosis correlation'))
# 'tsne' was never created in this script, so the original rm(mtcor_by_module, tsne)
# raised a warning; only remove what exists
rm(mtcor_by_module)


PCA of Samples

  • The two main patterns that seem to characterise the genes are their Gene Significance and the Module-Diagnosis correlation of their corresponding module

  • Mean Expression doesn’t seem to play an important role

  • I don’t know what the 2nd PC is capturing

# Mean Expression data
# preprocessed_data.RData provides datExpr, datGenes, datMeta, dds and DE_info
load('./../Data/preprocessed_data.RData')
## Registered S3 methods overwritten by 'Hmisc':
##   method                 from 
##   [.labelled             expss
##   print.labelled         expss
##   as.data.frame.labelled expss
datExpr = datExpr %>% data.frame
# Mean level of expression of each gene across all samples
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))

# PCA
# NOTE(review): this recomputes the same PCA as the variables section above;
# here pca$rotation gives the gene-level (sample) coordinates
pca = dataset %>% t %>% prcomp

plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2], 
                       'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
            mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')

# p1: genes coloured by Module-Diagnosis correlation
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by Module-Diagnosis correlation') +
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme(legend.position='bottom')

# p2: genes coloured by Gene Significance
p2 = plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme_minimal() + ggtitle('Genes coloured by Gene Significance') + theme(legend.position='bottom')

# p3: genes coloured by SFARI label (SFARI genes drawn more opaque),
# with marginal densities per label added on top
p3 = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(alpha = plot_data$alpha) +
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
p3 = ggExtra::ggMarginal(p3, type='density', groupColour=TRUE, size=10)

# p4: genes coloured by mean level of expression
p4 = plot_data %>% ggplot(aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme_minimal() + ggtitle('Genes coloured by mean level of expression') + theme(legend.position='bottom')

grid.arrange(p1, p2, p3, p4, nrow=2)

# Clean up everything loaded from the RData file plus the plot objects
rm(pca, datExpr, datGenes, datMeta, dds, DE_info, mean_expr, p1, p2, p3, p4)



Resampling to reduce class imbalance


4.91% of the observations are positive

# Raw label counts: ~4.9% of the genes carry the positive (SFARI) label
print(table(dataset$SFARI))
## 
## FALSE  TRUE 
## 15247   787
#cat(paste0('\n',round(mean(dataset$SFARI)*100,2), '% of the observations are positive'))

For now, will do this using over- and under-sampling of the classes, but later on should check SMOTE (Synthetic Minority Over-sampling Technique) method

Need to divide first into train and test sets to keep the sets independent: using 80% of the Positive observations on the training set

Note: Even though our label is binary, I want to have representative samples for all SFARI scores in both the training and test data, so instead of pooling all the SFARI scores together and randomly selecting 80% of the samples, I’m going to create the positive set selecting 80% of each of the samples by score

set.seed(123)

# Split the positive (SFARI) genes into train/test indices, sampling a fraction
# p of the genes of EACH SFARI score (1-3) separately, so both sets keep every
# score represented. Returns a list with sorted 'train' and 'test' indices
# (positions in the rows of `dataset`).
positive_sample_balancing_SFARI_scores = function(p){
  
  train_idx = c()
  test_idx = c()
  
  for(s in 1:3){
    genes_with_score = rownames(original_dataset)[rownames(original_dataset) %in% rownames(dataset) &
                                                  original_dataset$gene.score == s]
    idx_with_score = which(rownames(dataset) %in% genes_with_score)
    train_with_score = sample(idx_with_score, size = ceiling(p*length(idx_with_score)))
    test_with_score = setdiff(idx_with_score, train_with_score)
    
    train_idx = c(train_idx, train_with_score)
    test_idx = c(test_idx, test_with_score)
  }
  
  return(list('train' = sort(train_idx), 'test' = sort(test_idx)))
}

# 80% of the samples for the training set
p = 0.8

# Positive (SFARI) indices, split 80/20 while balancing the SFARI scores
positive_idx = positive_sample_balancing_SFARI_scores(p)
positive_train_idx = positive_idx[['train']]
positive_test_idx = positive_idx[['test']]

# Negative indices: plain 80/20 random split
negative_idx = which(!dataset$SFARI)
negative_train_idx = sort(sample(negative_idx, size=ceiling(p*length(negative_idx))))
negative_test_idx = negative_idx[!negative_idx %in% negative_train_idx] # This is overwritten below because of the subsampling

train_set = dataset[c(positive_train_idx, negative_train_idx),]
test_set = dataset[c(positive_test_idx, negative_test_idx),] # This is overwritten below because of the subsampling

rm(positive_idx, negative_idx, positive_train_idx, positive_test_idx, negative_train_idx, negative_test_idx,
   p, positive_sample_balancing_SFARI_scores)


Balancing the dataset to obtain a 1:1 ratio in labels


Over-sampling observations with positive SFARI label: Sample with replacement 4x original number of observations

Sample with replacement positive observations in train set

# Indices of the positive observations currently in the training set
positive_obs = which(train_set$SFARI)

# Draw 3x extra copies with replacement, so positives end up at 4x their
# original count
add_obs = sample(positive_obs, size=3*length(positive_obs), replace=TRUE)

# Append the duplicated rows to the training set
train_set = train_set[c(seq_len(nrow(train_set)), add_obs),]

rm(positive_obs, add_obs)

Under-sampling observations with negative SFARI labels

# Ratio of positives to negatives after over-sampling = fraction of the
# negatives that will be kept to reach a 1:1 ratio
cat(paste0('Keeping ~',round(100*sum(train_set$SFARI)/sum(!train_set$SFARI)),
             '% of the Negative observations in the training set'))
## Keeping ~21% of the Negative observations in the training set
negative_obs = which(!train_set$SFARI)

# Keep exactly as many negatives as there are positives
keep_obs = sample(negative_obs, size=sum(train_set$SFARI))

train_set = train_set[c(keep_obs, which(train_set$SFARI)),]

# Add observations we removed from the training data to the test data
lost_obs = !rownames(dataset) %in% c(rownames(train_set), rownames(test_set))
cat(paste0('Adding ', sum(lost_obs), ' Negative samples to the test set from the ones we just removed from the training set'))
## Adding 9674 Negative samples to the test set from the ones we just removed from the training set
test_set = rbind(test_set, dataset[lost_obs,])


rm(negative_obs, keep_obs)

Label distribution in training set

# Label counts: the training set is now balanced 1:1
cro(train_set$SFARI)
 #Total 
 train_set$SFARI 
   FALSE  2524
   TRUE  2524
   #Total cases  5048

Labels distribution in test set

# Label counts: the test set keeps the natural class imbalance
cro(test_set$SFARI)
 #Total 
 test_set$SFARI 
   FALSE  12723
   TRUE  156
   #Total cases  12879


Logistic Regression


Train model

# glm needs a factor response for binomial classification
train_set$SFARI = as.factor(train_set$SFARI)

# Plain logistic regression on all features
fit = glm(SFARI~., family='binomial', data=train_set)

The features are strongly correlated, which inflates the standard error of the coefficients, making them no longer interpretable

Variance Inflation Factor (VIF) and correlation plot

# VIF
# Compute the VIFs once (the original called car::vif(fit) twice, doubling the work)
vifs = car::vif(fit) %>% sort
plot_data = data.frame('Feature' = names(vifs), 'VIF' = unname(vifs)) %>%
            mutate(outlier = VIF>10)  # conventional VIF > 10 multicollinearity cutoff

plot_data %>% ggplot(aes(reorder(Feature, -VIF), VIF, fill = !outlier)) + geom_bar(stat='identity') + scale_y_log10() +
              geom_hline(yintercept = 10, color = 'gray', linetype = 'dashed') + xlab('Model Features') + theme_minimal() +
              theme(legend.position = 'none', axis.text.x = element_text(angle = 90, hjust = 1))

# Correlation plot of all features (the SFARI label in the last column is excluded)
corrplot.mixed(cor(train_set[,-ncol(train_set)]), lower = 'number', lower.col = 'gray', number.cex = .6, 
               tl.pos = 'l', tl.col = '#666666')

# getinfo and mart were already removed right after the biomaRt query, so the
# original rm(getinfo, mart, clusterings) raised warnings; only remove what exists
rm(vifs, clusterings)

Possible solutions to Multicollinearity:


  1. Remove all variables with a VIF>10: We would lose all but two of our variables, not ideal

  2. Do Principal Component Regression: We would lose the relation between the prediction and the original features, which could be interesting to study

  3. Don’t do anything: Multicollinearity affects the coefficients and p-values of the regression, but it doesn’t affect the predictions, precision of the predictions or the goodness-of-fit statistics (see reference), but as with the previous option, we cannot study the coefficients of the regression

  4. Use Ridge Regression: The penalty it gives to high coefficients reduces the variance introduced by the correlation, making the coefficients interpretable again


Ridge Regression


Notes:

### DEFINE FUNCTIONS

# Create positive training set including all SFARI scores: for each score (1-3),
# sample a fraction p of its genes. Returns the (unsorted) training indices,
# as positions in the rows of `dataset`. The seed makes each split reproducible.
positive_sample_balancing_SFARI_scores = function(p, seed){
  
  set.seed(seed)
  train_idx = c()
  
  for(s in 1:3){
    genes_with_score = rownames(original_dataset)[rownames(original_dataset) %in% rownames(dataset) &
                                                  original_dataset$gene.score == s]
    idx_with_score = which(rownames(dataset) %in% genes_with_score)
    train_idx = c(train_idx, sample(idx_with_score, size = ceiling(p*length(idx_with_score))))
  }
  
  return(train_idx)
}

# Build one train/test split: positives are balanced across SFARI scores and
# over-sampled 'over_sampling_fold' extra times (with replacement); negatives
# are sampled to match the positive count (1:1 ratio). Everything not used for
# training becomes the test set.
create_train_test_sets = function(p, over_sampling_fold, seed){
  
  # Positive training indices (may contain duplicates, on purpose)
  pos_idx = positive_sample_balancing_SFARI_scores(p, seed)
  pos_idx = c(pos_idx, sample(pos_idx, size = ceiling(over_sampling_fold*length(pos_idx)), replace=TRUE))
  
  # Negative training indices: as many as there are positives
  neg_pool = which(!dataset$SFARI)
  neg_idx = sample(neg_pool, size = length(pos_idx))
  
  # Train = sampled rows (duplicates kept); test = all remaining rows
  train_set = dataset[sort(c(pos_idx, neg_idx)),]
  test_set = dataset[-unique(c(pos_idx, neg_idx)),]
  
  return(list('train_set' = train_set, 'test_set' = test_set))
  
}

# Train a ridge-regularised logistic regression (glmnet with alpha = 0) on one
# train/test split and evaluate it on the held-out test set.
#
# p:                  fraction of each class used for training
# over_sampling_fold: extra copies of the positive observations to draw
# seed:               seed controlling this iteration's train/test split
#
# Returns a list with accuracy, precision, recall, F1, AUC, the per-gene
# predictions of the test set and the fitted coefficient vector.
run_model = function(p, over_sampling_fold, seed){
  
  # Create train and test sets
  train_test_sets = create_train_test_sets(p, over_sampling_fold, seed)
  train_set = train_test_sets[['train_set']] %>% data.frame
  test_set = train_test_sets[['test_set']]
  
  # Train Model: 10-fold CV over a lambda grid; alpha = 0 means pure ridge.
  # The fixed seed makes the CV fold assignment identical across iterations
  train_set$SFARI = train_set$SFARI %>% as.factor
  lambda_seq = 10^seq(1, -4, by = -.1)
  set.seed(123)
  fit = train(SFARI ~., data = train_set, method = 'glmnet', trControl = trainControl('cv', number = 10),
              tuneGrid = expand.grid(alpha = 0, lambda = lambda_seq))
  
  # Predict labels in test set: probability of the positive class, cut at 0.5
  predictions = fit %>% predict(test_set, type='prob')
  preds = data.frame('ID'=rownames(test_set), 'prob'=predictions$`TRUE`) %>% mutate(pred = prob>0.5)

  # Measure performance of the model (logical labels coerced to 0/1; the
  # positive class is '1')
  acc = mean(test_set$SFARI==preds$pred)
  prec = Precision(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  rec = Recall(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  F1 = F1_Score(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  pred_ROCR = prediction(preds$prob, test_set$SFARI)
  AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
  
  # Extract coefficients from features at the CV-selected lambda
  coefs = coef(fit$finalModel, fit$bestTune$lambda) %>% as.vector
  
  return(list('acc' = acc, 'prec' = prec, 'rec' = rec, 'F1' = F1, 
              'AUC' = AUC, 'preds' = preds, 'coefs' = coefs))
}


### RUN MODEL

# Parameters
p = 0.8                      # fraction of each class used for training
over_sampling_fold = 3       # extra copies of the positive observations
n_iter = 100                 # number of random train/test splits
seeds = 123:(123+n_iter-1)   # one seed per iteration, for reproducibility

# Store outputs. The metric vectors are preallocated instead of being grown
# with c() inside the loop
acc = numeric(n_iter)
prec = numeric(n_iter)
rec = numeric(n_iter)
F1 = numeric(n_iter)
AUC = numeric(n_iter)
predictions = data.frame('ID' = rownames(dataset), 'SFARI' = dataset$SFARI, 'prob' = 0, 'pred' = 0, 'n' = 0)
coefs = data.frame('var' = c('Intercept', colnames(dataset[,-ncol(dataset)])), 'coef' = 0)

for(i in seq_len(n_iter)){
  
  # Run model for this iteration's seed
  model_output = run_model(p, over_sampling_fold, seeds[i])
  
  # Update outputs
  acc[i] = model_output[['acc']]
  prec[i] = model_output[['prec']]
  rec[i] = model_output[['rec']]
  F1[i] = model_output[['F1']]
  AUC[i] = model_output[['AUC']]
  preds = model_output[['preds']]
  coefs$coef = coefs$coef + model_output[['coefs']]
  
  # Accumulate probability/prediction for every gene in this iteration's test
  # set; n counts how many times each gene was scored. Test sets preserve the
  # dataset row order, so the subset rows align with preds
  update_preds = preds %>% dplyr::select(-ID) %>% mutate(n=1)
  predictions[predictions$ID %in% preds$ID, c('prob','pred','n')] = predictions[predictions$ID %in% 
                                                                      preds$ID, c('prob','pred','n')] +
                                                                    update_preds
}

# Average the coefficients over iterations; average each gene's probability
# over the times it was actually scored, then threshold at 0.5 (pred_count
# keeps the raw count of positive predictions)
coefs = coefs %>% mutate(coef = coef/n_iter)
predictions = predictions %>% mutate(prob = prob/n, pred_count = pred, pred = prob>0.5)

rm(p, over_sampling_fold, seeds, i, update_preds, positive_sample_balancing_SFARI_scores, create_train_test_sets, run_model)


To summarise in a single value the predictions of the models:

# One row per gene that appeared in at least one test set (n > 0), with its
# averaged model probability plus GS and MTcor for the analyses below
test_set = predictions %>% filter(n>0) %>% left_join(dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, GS, MTcor), by = 'ID')
# filter() preserves row order, so these IDs align with the filtered rows
rownames(test_set) = predictions$ID[predictions$n>0]


Performance metrics


Confusion matrix

# Label the columns so cro() prints readable headers
conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels', 
                                     prob = 'Assigned Probability', 
                                     pred = 'Label Prediction')

# Confusion matrix: actual label (rows) vs averaged-model prediction (columns)
cro(conf_mat$SFARI, list(conf_mat$pred, total()))
 Label Prediction     #Total 
 FALSE   TRUE   
 Actual Labels 
   FALSE  10038 5209   15247
   TRUE  297 490   787
   #Total cases  10335 5699   16034
rm(conf_mat)


Accuracy: Mean = 0.654 SD = 0.0077


Precision: Mean = 0.0216 SD = 0.0012


Recall: Mean = 0.621 SD = 0.0375


F1 score: Mean = 0.0417 SD = 0.0023


ROC Curve: Mean = 0.6895 SD = 0.019

# ROCR prediction object built from the averaged probabilities and true labels
pred_ROCR = prediction(test_set$prob, test_set$SFARI)

# ROC curve (true positive rate vs false positive rate) and its AUC
roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
auc = performance(pred_ROCR, measure='auc')@y.values[[1]]

plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(auc,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')  # diagonal = random-classifier baseline


Lift Curve

# Lift curve: lift vs rate of positive predictions
lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')

# AUC here is the per-iteration AUC vector from the training loop, no longer needed
rm(pred_ROCR, roc_ROCR, AUC, lift_ROCR)




Coefficients


MTcor and absGS have a very small coefficient and Gene Significance has a negative coefficient

# For each gene: its module's MTcor and its SFARI label. Module hex codes are
# stripped of '#' so they match the MM.xxxxxx feature names of the model
gene_corr_info = dataset %>% mutate('ID' = rownames(dataset)) %>% dplyr::select(ID, MTcor, SFARI) %>% left_join(assigned_module, by ='ID') %>%
                 mutate(Module = gsub('#','',Module))

# Join each coefficient to its module's stats (SFARI_perc = fraction of the
# module's genes that are SFARI genes); non-module features (GS, absGS, MTcor,
# Intercept) get NA for MTcor and SFARI_perc
coef_info = coefs %>% mutate('feature' = gsub('MM.','',var)) %>% left_join(gene_corr_info, by = c('feature' = 'Module')) %>% 
            dplyr::select(feature, coef, MTcor, SFARI) %>% group_by(feature, coef, MTcor) %>% summarise('SFARI_perc' = mean(SFARI)) %>%
            arrange(desc(coef))

kable(coef_info %>% dplyr::select(feature, coef) %>% rename('Feature' = feature, 'Coefficient' = coef),
      align = 'cc', caption = 'Regression Coefficients')
Regression Coefficients
Feature Coefficient
00A8FF 0.7185317
B79F00 0.5181573
8195FF 0.4979092
EE67EC 0.4900560
F8766D 0.4773346
00BECF 0.4050179
BF80FF 0.3426857
00C083 0.2772603
FE61CF 0.2509699
2EA2FF 0.2411567
00BF74 0.2379291
619CFF 0.2307058
00BFC4 0.2249593
DB8E00 0.2247468
00B9E3 0.2191021
FA62DA 0.2113841
00ADFA 0.2071333
00BC50 0.2070501
00C0B9 0.1970214
73B000 0.1921302
FC727E 0.1835976
D39200 0.1652660
EE8045 0.1572418
FF66AA 0.1432425
CA9700 0.0990985
A0A600 0.0951683
00BBDA 0.0928223
00C19F 0.0557449
FF699C 0.0447042
FE6D8D 0.0378138
DB72FB 0.0110478
MTcor -0.0156356
998EFF -0.0189111
absGS -0.0476225
F564E3 -0.0574365
00B80F -0.0628890
ACA300 -0.0716275
CE79FF -0.0770455
GS -0.0945027
84AD00 -0.1000193
FF61C3 -0.1217880
00C092 -0.1338882
93AA00 -0.1387393
F37B5A -0.1415803
00BD63 -0.1497667
FF63B7 -0.1650215
AE87FF -0.1673525
5EB300 -0.1722345
E56CF4 -0.1812170
E88526 -0.2018223
41B500 -0.2118645
Intercept -0.2310382
00BA38 -0.2410595
00B6EC -0.3506428
C19B00 -0.3714003
00B2F4 -0.4233121
E28900 -0.4517536
00C1AC -0.5237469

  • There is a positive relation between the coefficient assigned to the membership of each module and the percentage of SFARI genes that are assigned to that module
# Coefficient vs % of SFARI genes per module (modules only; NA MTcor rows are
# the non-module features).
# NOTE(review): point colours are taken from coef_info's row order and assumed
# to match the plotted (filtered) rows — confirm the ordering is preserved
ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
              ggplot(aes(coef, SFARI_perc)) +  geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
              geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)])) + 
              theme_minimal() + xlab('Coefficient') + 
              ylab('% of SFARI genes in Module'))


  • There doesn’t seem to be a relation between the coefficient and the correlation of the module and the diagnosis.

This is not a surprise since we knew that there was a negative relation between SFARI genes and Module-Diagnosis correlation from Preprocessing/Gandal/AllRegions/RMarkdowns/20_04_03_WGCNA_modules_EA.html. The fact that there is no relation between coefficient and Module-Diagnosis correlation could even be a good sign that the model is picking some biological signal as well as the SFARI patterns (since the relation with the biological signals is positive)

# Coefficient vs Module-Diagnosis correlation (modules only; each point keeps
# its module's hex colour, taken from coef_info's row order)
ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
              ggplot(aes(coef, MTcor)) +  geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
              geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)])) + 
              theme_minimal() + xlab('Coefficient') + 
              ylab('Module-Diagnosis correlation'))




Analyse model


SFARI genes have a higher score distribution than the rest, but the overlap is large

plot_data = test_set %>% dplyr::select(prob, SFARI)

# Score densities by true label; dashed lines mark each group's mean probability
ggplotly(plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
         geom_vline(xintercept = mean(plot_data$prob[plot_data$SFARI]), color = '#00C0C2', linetype = 'dashed') +
         geom_vline(xintercept = mean(plot_data$prob[!plot_data$SFARI]), color = '#FF7371', linetype = 'dashed') +
         theme_minimal() + ggtitle('Model score distribution by SFARI Label'))
  • There seems to be a small but consistent positive relation between the SFARI scores and the probability assigned by the model
# Recover each gene's SFARI score; non-SFARI genes are split into 'Neuronal'
# (genes with a GO neuronal annotation) and 'Others'
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
            mutate(gene.score = ifelse(is.na(gene.score), ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'), 
                                       gene.score)) %>%
            dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')

# Count of genes in each group
cro(plot_data$gene.score)
 #Total 
 SFARI Gene score 
   1  143
   2  205
   3  439
   Neuronal  933
   Others  14314
   #Total cases  16034
# NOTE(review): mean_vals is computed but never used below (the plots use
# stat_summary instead); kept so the rm() at the end of this chunk still works
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_prob = mean(prob))

# Probability distribution per group (SFARI scores 1-3, Neuronal, Others),
# coloured with the standard SFARI palette
ggplotly(plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
              ggtitle('Distribution of probabilities by SFARI score') +
              xlab('SFARI score') + ylab('Probability') + theme_minimal())
# Pairs of groups compared below with Welch t-tests
comparisons = list(c('1','Neuronal'), c('2','Neuronal'), c('3','Neuronal'), c('1','Others'), c('2','Others'),
                   c('3','Others'), c('1','2'), c('2','3'), c('Neuronal', 'Others'))

# Mean +/- SD per group, with significance stars for each pairwise comparison
plot_data %>% ggplot(aes(gene.score, prob, color = gene.score)) + 
              stat_summary(fun.data = mean_sd, geom = 'errorbar', width = 0.5, color = '#999999') + 
              stat_summary(fun = mean, geom = 'point', size = 3) + 
              stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test', 
                                 method.args = list(var.equal = FALSE), 
                                 label.y = c(.8, .75, .7, .95, .9, .85, rep(1, 3)), tip.length = 0.01) +
                                 #label.y = c(.5, .45, .4, .7, .65, .6, rep(0.8, 3)), tip.length = 0.01) +
              ggtitle('Distribution of probabilities by SFARI score') +
              xlab('SFARI Score') + ylab('Probability') +
              scale_colour_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
              theme_minimal() + theme(legend.position = 'none')

rm(mean_vals)

Genes with highest scores in test set


  • Considering the class imbalance in the test set (1:19), there are many more SFARI scores in here (1:3)

  • 14/18 SFARI Genes in this list belong to the SFARI Score 1, 2 to Score 2 and only 1 to Score 3

# Top 50 genes by averaged model probability, annotated with gene symbol,
# module statistics and SFARI score (or Neuronal/Others for non-SFARI genes)
test_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(test_set)) %>% 
             arrange(desc(prob)) %>% top_n(50, wt=prob) %>%
             left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID')  %>%
             mutate(gene.score = ifelse(is.na(gene.score), ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'), 
                                        gene.score)) %>%
             left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
             dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob, 'ModuleDiagnosis_corr' = MTcor, 'GeneSignificance' = GS) %>%
             mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr,4), Probability = round(Probability,4), 
                    GeneSignificance = round(GeneSignificance,4)) %>%
             dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability, gene.score) %>%
             kable(caption = 'Genes with highest model probabilities from the test set')
Genes with highest model probabilities from the test set
GeneSymbol GeneSignificance ModuleDiagnosis_corr Module Probability gene.score
RPRD2 -0.0922 -0.5528 #FE61CF 0.8747 Others
MBD5 0.1943 0.0586 #B79F00 0.8734 1
MYCBP2 -0.3975 -0.4939 #8195FF 0.8706 Neuronal
EP300 -0.0234 0.1127 #00A8FF 0.8682 1
TANC2 -0.2348 -0.5528 #FE61CF 0.8665 1
RAPH1 -0.0534 -0.4939 #8195FF 0.8571 Others
KMT2A 0.1347 0.5941 #619CFF 0.8551 1
NFAT5 0.1035 0.0586 #B79F00 0.8547 Others
ANK2 -0.4368 -0.9355 #00B9E3 0.8528 1
PRDM2 -0.2518 0.0586 #B79F00 0.8520 Others
RFX7 0.1372 0.0586 #B79F00 0.8519 Others
ARID1B 0.2711 0.1127 #00A8FF 0.8499 1
C20orf112 0.0314 0.1127 #00A8FF 0.8418 Others
EP400 -0.1671 -0.5528 #FE61CF 0.8416 2
SRRM4 -0.4400 -0.6877 #BF80FF 0.8416 Others
FMNL1 -0.2223 -0.6877 #BF80FF 0.8404 Others
ANK3 -0.4814 0.0586 #B79F00 0.8401 1
HIVEP2 0.0165 -0.9355 #00B9E3 0.8393 1
RNF111 -0.2410 0.0586 #B79F00 0.8380 Others
CREBBP 0.1431 0.1127 #00A8FF 0.8349 1
HUNK -0.3273 -0.5100 #F8766D 0.8333 Others
KMT2D -0.3255 -0.5528 #FE61CF 0.8324 Others
HECTD4 -0.3170 -0.5528 #FE61CF 0.8320 1
KCNJ6 -0.1379 -0.9355 #00B9E3 0.8310 Others
TLE3 0.4884 0.1127 #00A8FF 0.8308 Others
GRIN2B -0.2761 -0.7497 #FF66AA 0.8296 1
ASAP1 -0.0896 -0.6877 #BF80FF 0.8296 Others
KIAA1109 -0.1173 -0.0094 #00BECF 0.8277 Others
GATAD2B -0.4221 -0.5528 #FE61CF 0.8275 Others
SRCAP -0.3623 -0.5528 #FE61CF 0.8245 1
CELF2 -0.0605 -0.9355 #00B9E3 0.8239 Others
DROSHA -0.4111 -0.5528 #FE61CF 0.8235 Others
SAP130 -0.2482 -0.5528 #FE61CF 0.8225 Others
TP53INP2 -0.2140 -0.4946 #2EA2FF 0.8215 Others
ARID1A -0.1378 -0.5528 #FE61CF 0.8213 Others
DAAM1 0.1579 -0.0094 #00BECF 0.8209 Others
CACNG3 -0.4689 -0.6877 #BF80FF 0.8199 Others
BAZ2A 0.0534 -0.0094 #00BECF 0.8189 Others
PCLO -0.1879 0.0586 #B79F00 0.8188 Others
KMT2E 0.0723 0.5941 #619CFF 0.8176 2
TLN2 -0.4783 -0.7497 #FF66AA 0.8169 Others
ZNF804A -0.2768 -0.5100 #F8766D 0.8168 2
RIMBP2 0.2615 -0.0094 #00BECF 0.8167 Others
DLGAP4 -0.3307 -0.5528 #FE61CF 0.8165 Others
POGZ 0.0391 0.1127 #00A8FF 0.8164 1
ETV6 -0.1418 -0.6877 #BF80FF 0.8163 Others
MED13L 0.3795 0.0586 #B79F00 0.8160 1
AMPD3 -0.2810 -0.9355 #00B9E3 0.8158 Others
NAV1 -0.1734 -0.5528 #FE61CF 0.8154 Neuronal
GABBR2 -0.6249 -0.9355 #00B9E3 0.8151 3




Negative samples distribution


Selecting the Negative samples in the test set

# Genes without the SFARI label in the accumulated test set
negative_set = test_set %>% filter(!SFARI)

negative_set_table = negative_set %>% apply_labels(prob = 'Assigned Probability', 
                                                   pred = 'Label Prediction')
# Prediction counts among the negatives
cro(negative_set_table$pred)
 #Total 
 Label Prediction 
   FALSE  10038
   TRUE  5209
   #Total cases  15247
cat(paste0('\n', sum(negative_set$pred), ' genes are predicted as ASD-related'))
## 
## 5209 genes are predicted as ASD-related
# Probability density of the negatives; dotted line = 0.5 decision threshold
negative_set %>% ggplot(aes(prob)) + geom_density(color='#F8766D', fill='#F8766D', alpha=0.5) +
                 geom_vline(xintercept=0.5, color='#333333', linetype='dotted') + xlab('Probability') +
                 ggtitle('Probability distribution of the Negative samples in the Test Set') + 
                 theme_minimal()


Probability and Gene Significance


  • There’s a lot of noise, but the probability the model assigns to each gene seems to have a negative relation with the Gene Significance (under-expressed genes having, on average, the highest probabilities and over-expressed genes the lowest)

  • The pattern is stronger in under-expressed genes

# Probability vs Gene Significance, points coloured by module-trait correlation,
# with a loess trend and a reference line at GS = 0
negative_set %>% ggplot(aes(prob, GS, color = MTcor)) + geom_point() + 
                 geom_smooth(method = 'loess', color = '#666666') +
                 geom_hline(yintercept = 0, color='gray', linetype='dashed') + xlab('Probability') +
                 scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) + 
                 ggtitle('Relation between Probability and Gene Significance') + theme_minimal()


Probability and Module-Diagnosis correlation


  • There’s not a strong relation between the Module-Diagnosis correlation of the genes assigned module and the probability assigned by the model

  • The model seems to assign slightly higher probabilities to genes belonging to modules with negative Module-Diagnosis correlations than to genes belonging to modules with positive ones

# Module-Diagnosis correlation vs probability, with a loess trend and a
# reference line at the mean probability of the negative set
negative_set %>% ggplot(aes(MTcor, prob, color=GS)) + geom_point() + 
                 geom_smooth(method='loess', color='#666666') + 
                 geom_hline(yintercept=mean(negative_set$prob), color='gray', linetype='dashed') +
                 scale_color_gradientn(colours=c('#F8766D','#F8766D','white','#00BFC4','#00BFC4')) + 
                 xlab('Modules ordered by their correlation to ASD') + ylab('Model probability') +
                 theme_minimal()

Summarised version, plotting by module instead of by gene

The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same

The model seems to give higher probabilities to genes belonging to modules with a small (absolute) correlation to Diagnosis (this is unexpected)

# Mean/SD of the model probability per module, sized by module size.
# NOTE(review): the numeric MTcor value is used as the join key to recover the
# module colour — this relies on MTcor being unique per module and on exact
# float equality; joining on Module would be more robust. Confirm before reusing
plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
            mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>% 
            left_join(original_dataset, by='MTcor') %>%
            dplyr::select(Module, MTcor, MTcor_sign, mean, sd, n) %>% distinct()
colnames(plot_data)[1] = 'ID'

# Loess (gray) and per-sign linear trends of mean probability vs MTcor
ggplotly(plot_data %>% ggplot(aes(MTcor, mean, size=n, color=MTcor_sign)) + geom_point(aes(id=ID), alpha=0.7) + 
         geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) + 
         xlab('Module-Diagnosis correlation') + ylab('Mean Probability by Model') + theme_minimal())


Probability and level of expression


There is a positive relation between level of expression and probability, the model seems to be capturing indirectly the level of expression of the genes to make the prediction, so it’s introducing the same bias

# Gandal dataset
# Loads datExpr (expression matrix) and DE_info (differential expression
# results) from the preprocessed RData file
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# Per-gene summary statistics of expression across samples
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))

# Attach mean/sd expression and module assignment to each gene in the negative set
plot_data = negative_set %>% left_join(mean_and_sd, by='ID') %>% 
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>% 
                      dplyr::select(ID, Module), by='ID')
# Positional rename of the last column to 'Module' — presumably needed because
# the join suffixes the column name when negative_set already carries a Module
# column (NOTE(review): verify the last column really is the joined Module)
colnames(plot_data)[ncol(plot_data)] = 'Module'

# Gene-level scatter of mean expression vs model probability with a loess trend
plot_data %>%
  ggplot(aes(meanExpr, prob)) +
  geom_point(alpha=0.2, color='#0099cc') +
  geom_smooth(method='loess', color='gray', alpha=0.3) +
  labs(x='Mean Expression', y='Probability') +
  ggtitle('Mean expression vs model probability by gene') +
  theme_minimal()

rm(mean_and_sd)
# Collapse to one observation per module: average expression, average model
# probability, and module size
plot_data2 = plot_data %>%
             group_by(Module) %>%
             summarise(meanExpr = mean(meanExpr),
                       meanProb = mean(prob),
                       n = n())

# Module-level interactive plot: mean expression vs mean model probability.
# Points are colored directly with the module labels (color= outside aes) —
# assumes WGCNA module names are valid R color names; TODO confirm
ggplotly(plot_data2 %>% ggplot(aes(meanExpr, meanProb, size=n)) + 
         geom_point(color=plot_data2$Module, alpha = 0.7) + 
         geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) + 
         geom_smooth(method='lm', se=FALSE, color='gray') + theme_minimal() + theme(legend.position='none') + 
         ggtitle('Mean expression vs model probability by Module'))
rm(plot_data2)


Probability and lfc


There is a relation between probability and lfc, so it **IS** capturing a bit of true information (because lfc and mean expression were negatively correlated and it still has a positive relation in the model)

  • The relation is stronger in genes under-expressed in ASD
# Attach differential-expression results (log fold change, adjusted p-value)
# to each gene in the negative set
plot_data = negative_set %>%
            left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID')

# Gene-level scatter of log2 fold change vs model probability
plot_data %>%
  ggplot(aes(log2FoldChange, prob)) +
  geom_point(alpha=0.1, color='#0099cc') +
  geom_smooth(method='loess', color='gray', alpha=0.3) +
  ggtitle('LFC vs model probability by gene') +
  theme_minimal()

  • The relation is stronger in Differentially Expressed genes
# Left panel: under-expressed genes (LFC < 0), colored by whether they are
# differentially expressed (padj < 0.05). Negative right margin glues it to p2.
p1 = plot_data %>% filter(log2FoldChange<0) %>% mutate(DE = padj<0.05) %>% ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.1) + 
                   geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') + 
                   ylim(c(min(plot_data$prob), max(plot_data$prob))) + 
                   theme_minimal() + theme(legend.position = 'none', plot.margin=unit(c(1,-0.3,1,1), 'cm'))

# Right panel: over-expressed genes (LFC >= 0). The y-axis moves to the right
# side with the same limits as p1 so the two panels share a visual scale;
# the negative left margin removes the gap between the panels.
p2 = plot_data %>% filter(log2FoldChange>=0) %>% mutate(DE = padj<0.05) %>% ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.1) + 
                   geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') + ylab('') +
                   scale_y_continuous(position = 'right', limits = c(min(plot_data$prob), max(plot_data$prob))) +
                   theme_minimal() + theme(plot.margin = unit(c(1,1,1,-0.3), 'cm'), axis.ticks.y = element_blank())

# Arrange the two halves side by side as one figure with shared titles
grid.arrange(p1, p2, nrow=1, top = 'LFC vs model probability by gene', bottom = 'LFC')

rm(p1, p2)



Conclusion


The model is capturing the mean level of expression of the genes (indirectly through module membership), which is a strong bias found in the SFARI scores, but it seems to be capturing a bit of true biological signal as well (based on the GS and the log fold change plots)


Saving results

# Test-set predictions from the Ridge model, saved together with the full
# dataset for downstream analyses
predictions = test_set

save(predictions, dataset, file='./../Data/Ridge_model_robust_new_SFARI.RData')




Session info

# Record R version and package versions for reproducibility
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] expss_0.10.2       corrplot_0.84      MLmetrics_1.1.1    car_3.0-7         
##  [5] carData_3.0-3      ROCR_1.0-7         gplots_3.0.3       caret_6.0-86      
##  [9] lattice_0.20-41    biomaRt_2.40.5     ggpubr_0.2.5       magrittr_1.5      
## [13] RColorBrewer_1.1-2 gridExtra_2.3      viridis_0.5.1      viridisLite_0.3.0 
## [17] plotly_4.9.2       knitr_1.28         forcats_0.5.0      stringr_1.4.0     
## [21] dplyr_0.8.5        purrr_0.3.3        readr_1.3.1        tidyr_1.0.2       
## [25] tibble_3.0.0       ggplot2_3.3.0      tidyverse_1.3.0   
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                backports_1.1.5            
##   [3] Hmisc_4.4-0                 plyr_1.8.6                 
##   [5] lazyeval_0.2.2              splines_3.6.3              
##   [7] BiocParallel_1.18.1         crosstalk_1.1.0.1          
##   [9] GenomeInfoDb_1.20.0         digest_0.6.25              
##  [11] foreach_1.5.0               htmltools_0.4.0            
##  [13] gdata_2.18.0                fansi_0.4.1                
##  [15] checkmate_2.0.0             memoise_1.1.0              
##  [17] cluster_2.1.0               openxlsx_4.1.4             
##  [19] annotate_1.62.0             recipes_0.1.10             
##  [21] modelr_0.1.6                gower_0.2.1                
##  [23] matrixStats_0.56.0          prettyunits_1.1.1          
##  [25] jpeg_0.1-8.1                colorspace_1.4-1           
##  [27] blob_1.2.1                  rvest_0.3.5                
##  [29] haven_2.2.0                 xfun_0.12                  
##  [31] crayon_1.3.4                RCurl_1.98-1.1             
##  [33] jsonlite_1.6.1              genefilter_1.66.0          
##  [35] survival_3.1-12             iterators_1.0.12           
##  [37] glue_1.3.2                  gtable_0.3.0               
##  [39] zlibbioc_1.30.0             ipred_0.9-9                
##  [41] XVector_0.24.0              DelayedArray_0.10.0        
##  [43] shape_1.4.4                 BiocGenerics_0.30.0        
##  [45] abind_1.4-5                 scales_1.1.0               
##  [47] DBI_1.1.0                   miniUI_0.1.1.1             
##  [49] Rcpp_1.0.4.6                xtable_1.8-4               
##  [51] progress_1.2.2              htmlTable_1.13.3           
##  [53] foreign_0.8-76              bit_1.1-15.2               
##  [55] Formula_1.2-3               stats4_3.6.3               
##  [57] lava_1.6.7                  prodlim_2019.11.13         
##  [59] glmnet_3.0-2                htmlwidgets_1.5.1          
##  [61] httr_1.4.1                  acepack_1.4.1              
##  [63] ellipsis_0.3.0              pkgconfig_2.0.3            
##  [65] XML_3.99-0.3                farver_2.0.3               
##  [67] nnet_7.3-14                 dbplyr_1.4.2               
##  [69] locfit_1.5-9.4              later_1.0.0                
##  [71] tidyselect_1.0.0            labeling_0.3               
##  [73] rlang_0.4.5                 reshape2_1.4.3             
##  [75] AnnotationDbi_1.46.1        munsell_0.5.0              
##  [77] cellranger_1.1.0            tools_3.6.3                
##  [79] cli_2.0.2                   generics_0.0.2             
##  [81] RSQLite_2.2.0               broom_0.5.5                
##  [83] fastmap_1.0.1               evaluate_0.14              
##  [85] yaml_2.2.1                  ModelMetrics_1.2.2.2       
##  [87] bit64_0.9-7                 fs_1.4.0                   
##  [89] zip_2.0.4                   caTools_1.18.0             
##  [91] nlme_3.1-147                mime_0.9                   
##  [93] ggExtra_0.9                 xml2_1.2.5                 
##  [95] compiler_3.6.3              rstudioapi_0.11            
##  [97] png_0.1-7                   curl_4.3                   
##  [99] e1071_1.7-3                 ggsignif_0.6.0             
## [101] reprex_0.3.0                geneplotter_1.62.0         
## [103] stringi_1.4.6               highr_0.8                  
## [105] Matrix_1.2-18               vctrs_0.2.4                
## [107] pillar_1.4.3                lifecycle_0.2.0            
## [109] data.table_1.12.8           bitops_1.0-6               
## [111] httpuv_1.5.2                GenomicRanges_1.36.1       
## [113] latticeExtra_0.6-29         R6_2.4.1                   
## [115] promises_1.1.0              KernSmooth_2.23-17         
## [117] rio_0.5.16                  IRanges_2.18.3             
## [119] codetools_0.2-16            MASS_7.3-51.6              
## [121] gtools_3.8.2                assertthat_0.2.1           
## [123] SummarizedExperiment_1.14.1 DESeq2_1.24.0              
## [125] withr_2.1.2                 S4Vectors_0.22.1           
## [127] GenomeInfoDbData_1.2.1      mgcv_1.8-31                
## [129] parallel_3.6.3              hms_0.5.3                  
## [131] grid_3.6.3                  rpart_4.1-15               
## [133] timeDate_3043.102           class_7.3-17               
## [135] rmarkdown_2.1               pROC_1.16.2                
## [137] shiny_1.4.0.2               base64enc_0.1-3            
## [139] Biobase_2.44.0              lubridate_1.7.4